import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from dateutil import parser
import string
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\lalac\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
Dataset = Customer Support tweets
This dataset contains the tweets received and replied to by customer support accounts; it includes tweets from 702,777 unique author IDs (companies and customers combined). The dataset has the following columns:
The purpose of this project is to clean, analyze, and understand company responses made by modern customer support. The analysis will focus on the top 20 companies by the number of responses, tweets by date, response time, and a small text analysis that could classify a tweet.
Connecting the data with this notebook
# Load the customer-support tweets CSV into a DataFrame.
# NOTE(review): hard-coded absolute Windows path — only works on this machine.
df2 = pd.read_csv("C:/Users/lalac/OneDrive/Documents/Open Avenues project/twcs.csv")
# Preview the first rows to sanity-check the load.
df2.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | Tue Oct 31 22:10:47 +0000 2017 | @115712 I understand. I would like to assist y... | 2 | 3.0 |
| 1 | 2 | 115712 | True | Tue Oct 31 22:11:45 +0000 2017 | @sprintcare and how do you propose we do that | NaN | 1.0 |
| 2 | 3 | 115712 | True | Tue Oct 31 22:08:27 +0000 2017 | @sprintcare I have sent several private messag... | 1 | 4.0 |
| 3 | 4 | sprintcare | False | Tue Oct 31 21:54:49 +0000 2017 | @115712 Please send us a Private Message so th... | 3 | 5.0 |
| 4 | 5 | 115712 | True | Tue Oct 31 21:49:35 +0000 2017 | @sprintcare I did. | 4 | 6.0 |
"Data exploration is the first step of data analysis used to explore and visualize data to uncover insights from the start or identify areas or patterns to dig into more"
# Shape of the DataFrame: a (rows, columns) tuple.
df2.shape #Number of rows and columns
(2811774, 7)
# Column dtypes, non-null counts and memory usage for each column.
df2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2811774 entries, 0 to 2811773 Data columns (total 7 columns): # Column Dtype --- ------ ----- 0 tweet_id int64 1 author_id object 2 inbound bool 3 created_at object 4 text object 5 response_tweet_id object 6 in_response_to_tweet_id float64 dtypes: bool(1), float64(1), int64(1), object(4) memory usage: 131.4+ MB
# Count missing values per column (per the output, only the two
# tweet-id link columns contain NaNs).
df2.isnull().sum()
tweet_id 0 author_id 0 inbound 0 created_at 0 text 0 response_tweet_id 1040629 in_response_to_tweet_id 794335 dtype: int64
# Flag fully duplicated rows, report how many there are, and display them.
row_is_dup = df2.duplicated()
print(row_is_dup.sum())
df2[row_is_dup]
0
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id |
|---|
1.1 How many tweets are inbound and outbound responses?
# Count tweets that are inbound (written by customers to a company).
# 'inbound' is a bool column, so it can be used directly as a row mask.
inbound2 = df2.loc[df2['inbound']].count()
inbound2
tweet_id 1537843 author_id 1537843 inbound 1537843 created_at 1537843 text 1537843 response_tweet_id 1303829 in_response_to_tweet_id 750497 dtype: int64
# Count tweets that are outbound (replies sent by a company).
# Negating the bool 'inbound' column selects the company rows.
outbound2 = df2.loc[~df2['inbound']].count()
outbound2
tweet_id 1273931 author_id 1273931 inbound 1273931 created_at 1273931 text 1273931 response_tweet_id 467316 in_response_to_tweet_id 1266942 dtype: int64
For this dataset 1,537,843 tweets are inbound (sent by customers) and 1,273,931 tweets are outbound (responses sent by companies)
# create a graph
1.2 How many are unique tweets?
# Total number of tweets: inbound plus outbound together cover every row,
# so this is simply the row count of the DataFrame.
# (The previous hard-coded sum 1537873 + 1273931 = 2811804 used a mistyped
# inbound count — the actual counts are 1537843 + 1273931 = 2811774 = len(df2).)
total_number_of_tweets = len(df2)
total_number_of_tweets
2811804
# Count tweets whose text duplicates an earlier tweet's text.
duplicate_tweet2 = df2['text'].duplicated().sum()
duplicate_tweet2
29156
The number of duplicated tweets based on the text message is 29,156
# Unique tweets = total tweets minus text-level duplicates.
unique_tweet2 = total_number_of_tweets - duplicate_tweet2
unique_tweet2
2782648
There are 2,782,618 unique tweets (the cell above shows 2,782,648 because the hard-coded total over-counted by 30: the correct calculation is 2,811,774 − 29,156 = 2,782,618)
The data cleaning process for this chatbot dataset is going to be divided into 4 steps.
2.1 Cleaning null values in_response_to_tweet_id when the author is a company
We are deleting these rows because they are not responses to customers tweets
# Remove company tweets (inbound == False) that are not replies to anything,
# i.e. outbound rows with a missing in_response_to_tweet_id: they are not
# responses to customer tweets, so they are out of scope for this analysis.
is_orphan_company_tweet = (df2["inbound"] == False) & (df2['in_response_to_tweet_id'].isnull())
filtered_df = df2[~is_orphan_company_tweet]
filtered_df
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | Tue Oct 31 22:10:47 +0000 2017 | @115712 I understand. I would like to assist y... | 2 | 3.0 |
| 1 | 2 | 115712 | True | Tue Oct 31 22:11:45 +0000 2017 | @sprintcare and how do you propose we do that | NaN | 1.0 |
| 2 | 3 | 115712 | True | Tue Oct 31 22:08:27 +0000 2017 | @sprintcare I have sent several private messag... | 1 | 4.0 |
| 3 | 4 | sprintcare | False | Tue Oct 31 21:54:49 +0000 2017 | @115712 Please send us a Private Message so th... | 3 | 5.0 |
| 4 | 5 | 115712 | True | Tue Oct 31 21:49:35 +0000 2017 | @sprintcare I did. | 4 | 6.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2811769 | 2987947 | sprintcare | False | Wed Nov 22 08:43:51 +0000 2017 | @823869 Hey, we'd be happy to look into this f... | NaN | 2987948.0 |
| 2811770 | 2987948 | 823869 | True | Wed Nov 22 08:35:16 +0000 2017 | @115714 wtf!? I’ve been having really shitty s... | 2987947 | NaN |
| 2811771 | 2812240 | 121673 | True | Thu Nov 23 04:13:07 +0000 2017 | @143549 @sprintcare You have to go to https://... | NaN | 2812239.0 |
| 2811772 | 2987949 | AldiUK | False | Wed Nov 22 08:31:24 +0000 2017 | @823870 Sounds delicious, Sarah! 😋 https://t.c... | NaN | 2987950.0 |
| 2811773 | 2987950 | 823870 | True | Tue Nov 21 22:01:04 +0000 2017 | @AldiUK warm sloe gin mince pies with ice cre... | 2987951,2987949 | NaN |
2804785 rows × 7 columns
2.2 Finding false customers
There are customers (inbound == True) that have created so many tweets that they may actually be companies rather than individual customers
# Take the inbound (customer) rows only, group by author, and order the
# authors from most to fewest tweets — very prolific "customers" may really
# be companies.
customers = filtered_df[filtered_df["inbound"]]
false_customer = (
    customers.groupby(['author_id'])
    .count()
    .sort_values(by='tweet_id', ascending=False)
)
num_tweets_false_customer = false_customer['in_response_to_tweet_id']
num_tweets_false_customer
author_id
115911 1124
120576 1002
115913 179
116230 432
169172 446
...
549508 0
549507 0
345292 0
549505 0
263844 0
Name: in_response_to_tweet_id, Length: 702669, dtype: int64
# Top 20 author_id values by tweet volume — candidates for being companies.
false_customer.head(20)
| tweet_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|
| author_id | ||||||
| 115911 | 1286 | 1286 | 1286 | 1286 | 1286 | 1124 |
| 120576 | 1010 | 1010 | 1010 | 1010 | 669 | 1002 |
| 115913 | 563 | 563 | 563 | 563 | 561 | 179 |
| 116230 | 454 | 454 | 454 | 454 | 454 | 432 |
| 169172 | 448 | 448 | 448 | 448 | 447 | 446 |
| 117627 | 406 | 406 | 406 | 406 | 302 | 382 |
| 115888 | 332 | 332 | 332 | 332 | 332 | 1 |
| 116136 | 295 | 295 | 295 | 295 | 290 | 247 |
| 116421 | 276 | 276 | 276 | 276 | 218 | 172 |
| 115722 | 252 | 252 | 252 | 252 | 252 | 228 |
| 115714 | 250 | 250 | 250 | 250 | 250 | 114 |
| 115850 | 243 | 243 | 243 | 243 | 243 | 91 |
| 115990 | 243 | 243 | 243 | 243 | 243 | 232 |
| 121239 | 210 | 210 | 210 | 210 | 130 | 152 |
| 203476 | 197 | 197 | 197 | 197 | 128 | 175 |
| 115725 | 194 | 194 | 194 | 194 | 194 | 147 |
| 115798 | 181 | 181 | 181 | 181 | 179 | 178 |
| 127296 | 176 | 176 | 176 | 176 | 98 | 153 |
| 170351 | 174 | 174 | 174 | 174 | 35 | 174 |
| 169916 | 172 | 172 | 172 | 172 | 172 | 172 |
# Explore the busiest "customer" account to judge whether it is really a company.
cust_115911= filtered_df[(filtered_df["author_id"] == "115911")]
cust_115911
# This looks like a company based on the text (promo tweets, agent sign-offs like *ErikS).
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|---|
| 622 | 1068 | 115911 | True | Tue Oct 31 22:19:12 +0000 2017 | @115910 No sir! You can still preorder at http... | 1067 | 1069.0 |
| 624 | 1070 | 115911 | True | Tue Oct 31 22:16:12 +0000 2017 | @115910 Go for it Trevor *ErikS | 1069 | 1071.0 |
| 626 | 1072 | 115911 | True | Thu Oct 26 21:13:40 +0000 2017 | Be ready for your next iPhone. Join our iPhone... | 1071,1073,1074,1075,1076,1077,1078,1079,1080,1... | NaN |
| 1346 | 1950 | 115911 | True | Tue Oct 31 22:27:18 +0000 2017 | @116149 I mean...it does say to "submit your c... | 1949 | 1951.0 |
| 1348 | 1952 | 115911 | True | Tue Oct 31 13:00:17 +0000 2017 | Happy Halloween! Submit your costume pics with... | 1953,1954,1955,1956,1957,1958,1959,1960,1961,1... | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2797680 | 2974233 | 115911 | True | Thu Nov 30 02:24:15 +0000 2017 | @820276 Come back to your #MagentaFam! 🙌👍😊 *JPL | 2974232 | 2974234.0 |
| 2802786 | 2979188 | 115911 | True | Tue Oct 31 20:26:47 +0000 2017 | @576616 You can block known scam numbers by di... | 2979187 | 2979189.0 |
| 2802788 | 2979190 | 115911 | True | Tue Oct 31 20:20:33 +0000 2017 | @576616 We're working to cut down on scam call... | 2979189 | 2979191.0 |
| 2811004 | 2987195 | 115911 | True | Tue Nov 21 21:31:55 +0000 2017 | @823653 Of course, Shawn! Where are you going ... | 2987194 | 2987196.0 |
| 2811319 | 2987507 | 115911 | True | Wed Nov 22 01:09:14 +0000 2017 | @823739 Ouch. That's not the resolution we wan... | 2987506 | 2987508.0 |
1286 rows × 7 columns
# Explore another high-volume author for comparison.
cust_176402 = filtered_df[(filtered_df["author_id"] == '176402')]
cust_176402.head(5)
# This one reads like a genuine customer (complaints addressed to @AppleSupport).
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|---|
| 218496 | 254108 | 176402 | True | Thu Oct 05 22:32:41 +0000 2017 | @AppleSupport @176401 I have the same problem ... | NaN | 254107.0 |
| 220081 | 255805 | 176402 | True | Thu Oct 05 21:29:36 +0000 2017 | @AppleSupport @176842 11.0.2 stinks Apple just... | NaN | 255804.0 |
| 224830 | 260869 | 176402 | True | Thu Oct 05 21:33:22 +0000 2017 | @AppleSupport @178108 Kevin I honestly think t... | NaN | 260868.0 |
| 224833 | 260873 | 176402 | True | Thu Oct 05 21:34:06 +0000 2017 | @AppleSupport @178109 Over two weeks people ar... | 260875 | 260872.0 |
| 224834 | 260875 | 176402 | True | Thu Oct 05 21:34:23 +0000 2017 | @AppleSupport @178109 They want us all to go b... | NaN | 260873.0 |
# Outlier: maximum number of tweets posted by a single false customer.
num_tweets_false_customer.max()
1124
Based on the Top 20 false_customer dataframe, the number of tweets from these false customers won't change the future steps of this analysis, therefore we will not modify the filtered_df dataset we have until here.
2.3. Cleaning Date format
The 'created_at' column date format needs to be modified in order to have a data easier to understand and work with.
# Parse the Twitter-style timestamp strings ("Tue Oct 31 22:10:47 +0000 2017")
# into timezone-aware datetimes. pd.to_datetime with an explicit format is
# vectorized, so it is far faster than the original row-by-row
# dateutil parser.parse apply, and yields the same tz-aware values.
filtered_df['created_at'] = pd.to_datetime(filtered_df['created_at'], format='%a %b %d %H:%M:%S %z %Y')
filtered_df.head(5)
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 |
2.4 Text cleaning
Text cleaning is really important in this data analysis, therefore we will do different steps to get a simpler 'text' message/tweet. The 'text' column is the only one that is going to be modified here
# Make sure all the values in the 'text' column are string type
# (guards against NaN/float entries before the string operations below).
filtered_df["text"] = filtered_df["text"].astype(str)
2.4.1 Lower-casing the text to make sure all the text data is treated the same way
# Lower-case the tweet text so later token matching is case-insensitive
# and all text data is treated the same way.
filtered_df["text_lower"] = filtered_df["text"].str.lower()
filtered_df.head(5)
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. |
2.4.2 Removal of Punctuations
# string.punctuation is a str of the ASCII punctuation characters (not an array).
punctuations = string.punctuation
print(punctuations)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
#Creating a function to remove all the punctuation symbols above in the text column
# Build the translation table once at definition time — the original rebuilt
# it (via str.maketrans) on every call and relied on the mutable global
# `punctuations`; string.punctuation is the same character set.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuations(text):
    """Return *text* with every ASCII punctuation character removed.

    Uses a precomputed str.translate table: one C-level pass per call.
    """
    return text.translate(_PUNCT_TABLE)
# Apply the punctuation-removal function to every lower-cased tweet.
filtered_df["text_wo_punct"] = filtered_df["text_lower"].apply(lambda text: remove_punctuations(text))
filtered_df.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | text_wo_punct | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... | 115712 i understand i would like to assist you... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that | sprintcare and how do you propose we do that |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... | sprintcare i have sent several private message... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... | 115712 please send us a private message so tha... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. | sprintcare i did |
2.4.3 Removal of stopwords
Remove commonly occurring words that don't provide valuable information for downstream analysis (e.g. "the", "so", "a")
# Set of English stopwords (a set gives O(1) membership tests in the filter below).
stopwords_set = set(stopwords.words('english'))
#defining a function to remove the stopwords
def remove_stopwords(text, stop_set=None):
    """Return *text* with stopwords removed.

    Parameters
    ----------
    text : str
        Whitespace-delimited text to filter.
    stop_set : set of str, optional
        Words to drop. Defaults to the module-level English ``stopwords_set``,
        so existing single-argument calls behave exactly as before.
    """
    if stop_set is None:
        stop_set = stopwords_set
    return " ".join([word for word in str(text).split() if word not in stop_set])
# Apply the stopword-removal function to the punctuation-free text.
filtered_df["text_wo_stop"] = filtered_df["text_wo_punct"].apply(lambda text: remove_stopwords(text))
filtered_df.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | text_wo_punct | text_wo_stop | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... | 115712 i understand i would like to assist you... | 115712 understand would like assist would need... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that | sprintcare and how do you propose we do that | sprintcare propose |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... | sprintcare i have sent several private message... | sprintcare sent several private messages one r... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... | 115712 please send us a private message so tha... | 115712 please send us private message assist c... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. | sprintcare i did | sprintcare |
2.4.4 Removal of most common words
#Finding the most common words
# Counter consumes a flat generator of tokens — same counts as the original
# manual nested loop, but the tallying happens in C.
cnt = Counter(
    word
    for text in filtered_df["text_wo_stop"].values
    for word in text.split()
)
#The 15 most common words
cnt.most_common(15)
[('us', 450175),
('please', 401652),
('dm', 334913),
('help', 266991),
('hi', 223944),
('thanks', 206182),
('get', 199961),
('sorry', 191799),
('like', 146129),
('know', 145027),
('look', 139418),
('send', 138746),
('amazonhelp', 137271),
('well', 133716),
('service', 133340)]
# Build the set of the 15 most frequent words so membership tests below are O(1).
common_words = set([w for (w, wc) in cnt.most_common(15)])
#defining the remove_commonwords function
def remove_commonwords(text, frequent=None):
    """Return *text* with high-frequency words removed.

    Parameters
    ----------
    text : str
        Whitespace-delimited text to filter.
    frequent : set of str, optional
        Words to drop. Defaults to the module-level ``common_words`` set,
        so existing single-argument calls behave exactly as before.
    """
    if frequent is None:
        frequent = common_words
    return " ".join([word for word in str(text).split() if word not in frequent])
# Apply the common-word removal to the stopword-free text.
filtered_df["text_wo_common"] = filtered_df["text_wo_stop"].apply(lambda text: remove_commonwords(text))
filtered_df.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | text_wo_punct | text_wo_stop | text_wo_common | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... | 115712 i understand i would like to assist you... | 115712 understand would like assist would need... | 115712 understand would assist would need priv... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that | sprintcare and how do you propose we do that | sprintcare propose | sprintcare propose |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... | sprintcare i have sent several private message... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... | 115712 please send us a private message so tha... | 115712 please send us private message assist c... | 115712 private message assist click ‘message’ ... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. | sprintcare i did | sprintcare | sprintcare |
2.4.5 Removal of URLs
#Defining a function to remove_urls
# Compile the URL regex once at definition time — the original recompiled it
# on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

def remove_urls(text):
    """Return *text* with http(s)://... and www.... URLs replaced by ''."""
    return _URL_PATTERN.sub('', text)
# Apply the URL-removal function to the common-word-free text.
filtered_df["text_wo_url"] = filtered_df["text_wo_common"].apply(lambda text: remove_urls(text))
filtered_df.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | text_wo_punct | text_wo_stop | text_wo_common | text_wo_url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... | 115712 i understand i would like to assist you... | 115712 understand would like assist would need... | 115712 understand would assist would need priv... | 115712 understand would assist would need priv... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that | sprintcare and how do you propose we do that | sprintcare propose | sprintcare propose | sprintcare propose |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... | sprintcare i have sent several private message... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... | 115712 please send us a private message so tha... | 115712 please send us private message assist c... | 115712 private message assist click ‘message’ ... | 115712 private message assist click ‘message’ ... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. | sprintcare i did | sprintcare | sprintcare | sprintcare |
2.4.6 Remove Emoji
#function to remove emoji
# Compile the emoji character class once at definition time — the original
# rebuilt the pattern on every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbat symbols
                            u"\U000024C2-\U0001F251"  # range of characters
                            "]+", flags=re.UNICODE)

def remove_emoji(text):
    """Return *text* with characters in the emoji Unicode blocks above removed."""
    return _EMOJI_PATTERN.sub(r'', text)
# Apply the emoji-removal function to the URL-free text.
filtered_df["text_wo_emoji"] = filtered_df["text_wo_url"].apply(lambda text: remove_emoji(text))
filtered_df.head()
| tweet_id | author_id | inbound | created_at | text | response_tweet_id | in_response_to_tweet_id | text_lower | text_wo_punct | text_wo_stop | text_wo_common | text_wo_url | text_wo_emoji | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | @115712 I understand. I would like to assist y... | 2 | 3.0 | @115712 i understand. i would like to assist y... | 115712 i understand i would like to assist you... | 115712 understand would like assist would need... | 115712 understand would assist would need priv... | 115712 understand would assist would need priv... | 115712 understand would assist would need priv... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | @sprintcare and how do you propose we do that | NaN | 1.0 | @sprintcare and how do you propose we do that | sprintcare and how do you propose we do that | sprintcare propose | sprintcare propose | sprintcare propose | sprintcare propose |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | @sprintcare I have sent several private messag... | 1 | 4.0 | @sprintcare i have sent several private messag... | sprintcare i have sent several private message... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... | sprintcare sent several private messages one r... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | @115712 Please send us a Private Message so th... | 3 | 5.0 | @115712 please send us a private message so th... | 115712 please send us a private message so tha... | 115712 please send us private message assist c... | 115712 private message assist click ‘message’ ... | 115712 private message assist click ‘message’ ... | 115712 private message assist click ‘message’ ... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | @sprintcare I did. | 4 | 6.0 | @sprintcare i did. | sprintcare i did | sprintcare | sprintcare | sprintcare | sprintcare |
Getting the cleaned 'text' column after applying all the functions above. This is the clean dataset obtained:
# Keep only the fully cleaned text: drop the intermediate cleaning columns and
# the raw text, then rename the final cleaned column back to 'text'.
clean_df = (
    filtered_df
    .drop(columns=['text_lower', 'text_wo_punct', 'text_wo_stop',
                   'text_wo_common', 'text_wo_url', 'text'])
    .rename(columns={'text_wo_emoji': 'text'})
)
clean_df
| tweet_id | author_id | inbound | created_at | response_tweet_id | in_response_to_tweet_id | text | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | 2 | 3.0 | 115712 understand would assist would need priv... |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | NaN | 1.0 | sprintcare propose |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | 1 | 4.0 | sprintcare sent several private messages one r... |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | 3 | 5.0 | 115712 private message assist click ‘message’ ... |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | 4 | 6.0 | sprintcare |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2811769 | 2987947 | sprintcare | False | 2017-11-22 08:43:51+00:00 | NaN | 2987948.0 | 823869 hey wed happy direct message assist tn |
| 2811770 | 2987948 | 823869 | True | 2017-11-22 08:35:16+00:00 | 2987947 | NaN | 115714 wtf i’ve really shitty day shit together |
| 2811771 | 2812240 | 121673 | True | 2017-11-23 04:13:07+00:00 | NaN | 2812239.0 | 143549 sprintcare go httpstcov2tmhetl7q ask ad... |
| 2811772 | 2987949 | AldiUK | False | 2017-11-22 08:31:24+00:00 | NaN | 2987950.0 | 823870 sounds delicious sarah httpstco7uqpwyh1b6 |
| 2811773 | 2987950 | 823870 | True | 2017-11-21 22:01:04+00:00 | 2987951,2987949 | NaN | aldiuk warm sloe gin mince pies ice cream best... |
2804785 rows × 7 columns
Questions based on the data
Finding how many companies are in the data frame
# Group by author_id and keep the response_tweet_id / in_response_to_tweet_id counts.
# NOTE(review): author_id covers every author in the data, customers included —
# this counts unique authors, not only companies. Verify before relying on the
# "companies" wording in the printed message.
sort_table = clean_df.groupby(['author_id']).count().sort_values(by='tweet_id', ascending = False).loc[:, ['response_tweet_id', 'in_response_to_tweet_id']]
print(f'There are {len(sort_table)} companies in the dataset')
There are 702777 companies in the dataset
Selecting the top 20 companies based on the number of response_tweet_id and in_response_to_tweet_id
# Top 20 companies by number of tweets, re-sorted ascending so the horizontal
# bar chart below shows the largest bar at the top.
bar_top20 = sort_table.head(20).sort_values(by='in_response_to_tweet_id', ascending = True).reset_index()
# Total per company = outbound responses plus tweets responded to.
bar_top20['total_tweets'] = bar_top20['response_tweet_id'] + bar_top20['in_response_to_tweet_id']
bar_top20
| author_id | response_tweet_id | in_response_to_tweet_id | total_tweets | |
|---|---|---|---|---|
| 0 | ChipotleTweets | 5873 | 18612 | 24485 |
| 1 | AskPlayStation | 7661 | 18694 | 26355 |
| 2 | GWRHelp | 10845 | 19294 | 30139 |
| 3 | sainsburys | 8123 | 19417 | 27540 |
| 4 | hulu_support | 7616 | 21783 | 29399 |
| 5 | sprintcare | 7288 | 22335 | 29623 |
| 6 | XboxSupport | 10025 | 24341 | 34366 |
| 7 | Ask_Spectrum | 7640 | 25807 | 33447 |
| 8 | VirginTrains | 14940 | 27522 | 42462 |
| 9 | SouthwestAir | 8492 | 28889 | 37381 |
| 10 | British_Airways | 10053 | 29315 | 39368 |
| 11 | comcastcares | 7601 | 33007 | 40608 |
| 12 | TMobileHelp | 9729 | 34287 | 44016 |
| 13 | AmericanAir | 14390 | 36598 | 50988 |
| 14 | Tesco | 11076 | 38501 | 49577 |
| 15 | Delta | 11958 | 42197 | 54155 |
| 16 | SpotifyCares | 13764 | 43243 | 57007 |
| 17 | Uber_Support | 18027 | 56261 | 74288 |
| 18 | AppleSupport | 31423 | 106719 | 138142 |
| 19 | AmazonHelp | 84721 | 169287 | 254008 |
Creating a stack Bar chart to show the top 20 companies by number of tweets. Each bar has the percentage of tweets that are response_tweet_id(light blue) and in_response_to_tweet_id (steelblue)
# Stacked horizontal bar chart: one bar per company, split into
# response_tweet_id vs in_response_to_tweet_id counts.
fig = px.bar(bar_top20, y='author_id', x= ['response_tweet_id','in_response_to_tweet_id'],height=600 , text_auto=True)
fig.update_traces(textfont_size= 9)
fig.update_layout(
title='Top 20 companies by number of tweets',
yaxis_title='Company Name',
xaxis_title='Number of tweets',
)
fig.show()
Create a column 'only_date' by extracting the date from the 'created_at' column, keeping only the calendar date in YYYY-MM-DD format
#creating a column only selecting the date from created_at
# The original assigned `d = datetime.now()` here, but that value was dead
# code: the list comprehension immediately shadowed `d`. The vectorized
# .dt.date accessor produces the same datetime.date values without a
# Python-level loop.
clean_df["only_date"] = clean_df["created_at"].dt.date
clean_df
| tweet_id | author_id | inbound | created_at | response_tweet_id | in_response_to_tweet_id | text | only_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | 2 | 3.0 | 115712 understand would assist would need priv... | 2017-10-31 |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | NaN | 1.0 | sprintcare propose | 2017-10-31 |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | 1 | 4.0 | sprintcare sent several private messages one r... | 2017-10-31 |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | 3 | 5.0 | 115712 private message assist click ‘message’ ... | 2017-10-31 |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | 4 | 6.0 | sprintcare | 2017-10-31 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2811769 | 2987947 | sprintcare | False | 2017-11-22 08:43:51+00:00 | NaN | 2987948.0 | 823869 hey wed happy direct message assist tn | 2017-11-22 |
| 2811770 | 2987948 | 823869 | True | 2017-11-22 08:35:16+00:00 | 2987947 | NaN | 115714 wtf i’ve really shitty day shit together | 2017-11-22 |
| 2811771 | 2812240 | 121673 | True | 2017-11-23 04:13:07+00:00 | NaN | 2812239.0 | 143549 sprintcare go httpstcov2tmhetl7q ask ad... | 2017-11-23 |
| 2811772 | 2987949 | AldiUK | False | 2017-11-22 08:31:24+00:00 | NaN | 2987950.0 | 823870 sounds delicious sarah httpstco7uqpwyh1b6 | 2017-11-22 |
| 2811773 | 2987950 | 823870 | True | 2017-11-21 22:01:04+00:00 | 2987951,2987949 | NaN | aldiuk warm sloe gin mince pies ice cream best... | 2017-11-21 |
2804785 rows × 8 columns
Finding number of tweets by date.
# Tweets per calendar date, busiest dates first.
by_date = clean_df.groupby(['only_date']).count().sort_values(by='tweet_id', ascending = False).loc[:, ['tweet_id']].reset_index()
by_date
| only_date | tweet_id | |
|---|---|---|
| 0 | 2017-11-07 | 62621 |
| 1 | 2017-10-27 | 58984 |
| 2 | 2017-11-08 | 58034 |
| 3 | 2017-11-06 | 57844 |
| 4 | 2017-11-03 | 57099 |
| ... | ... | ... |
| 951 | 2015-01-18 | 1 |
| 952 | 2015-01-14 | 1 |
| 953 | 2015-01-10 | 1 |
| 954 | 2016-03-06 | 1 |
| 955 | 2014-09-05 | 1 |
956 rows × 2 columns
Average of the number of tweets by date
# Average number of tweets per date. numeric_only=True restricts the
# reduction to the numeric 'tweet_id' column, silencing the FutureWarning
# about dropping the non-numeric 'only_date' column.
by_date.mean(numeric_only=True)
C:\Users\lalac\AppData\Local\Temp\ipykernel_7560\2697763351.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
tweet_id 2933.875523 dtype: float64
Minimum number of tweets by date
# Per-column minimums: earliest date and smallest daily tweet count.
by_date.min()
only_date 2008-05-08 tweet_id 1 dtype: object
Maximum number of tweets by date
# Per-column maximums: latest date and largest daily tweet count.
by_date.max()
only_date 2017-12-03 tweet_id 62621 dtype: object
Finding date range minimum and maximum in the date set
# Earliest date present in the cleaned data.
clean_df['only_date'].min()
datetime.date(2008, 5, 8)
# Latest date present in the cleaned data.
clean_df['only_date'].max()
datetime.date(2017, 12, 3)
# Confirm 'only_date' is a pandas Series (of datetime.date objects).
type(clean_df['only_date'])
pandas.core.series.Series
Extract the year from 'created_at' and create a new column 'year'
# Extract the calendar year from the tz-aware 'created_at' timestamps.
clean_df['year'] = clean_df['created_at'].dt.year
clean_df
| tweet_id | author_id | inbound | created_at | response_tweet_id | in_response_to_tweet_id | text | only_date | year | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | sprintcare | False | 2017-10-31 22:10:47+00:00 | 2 | 3.0 | 115712 understand would assist would need priv... | 2017-10-31 | 2017 |
| 1 | 2 | 115712 | True | 2017-10-31 22:11:45+00:00 | NaN | 1.0 | sprintcare propose | 2017-10-31 | 2017 |
| 2 | 3 | 115712 | True | 2017-10-31 22:08:27+00:00 | 1 | 4.0 | sprintcare sent several private messages one r... | 2017-10-31 | 2017 |
| 3 | 4 | sprintcare | False | 2017-10-31 21:54:49+00:00 | 3 | 5.0 | 115712 private message assist click ‘message’ ... | 2017-10-31 | 2017 |
| 4 | 5 | 115712 | True | 2017-10-31 21:49:35+00:00 | 4 | 6.0 | sprintcare | 2017-10-31 | 2017 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2811769 | 2987947 | sprintcare | False | 2017-11-22 08:43:51+00:00 | NaN | 2987948.0 | 823869 hey wed happy direct message assist tn | 2017-11-22 | 2017 |
| 2811770 | 2987948 | 823869 | True | 2017-11-22 08:35:16+00:00 | 2987947 | NaN | 115714 wtf i’ve really shitty day shit together | 2017-11-22 | 2017 |
| 2811771 | 2812240 | 121673 | True | 2017-11-23 04:13:07+00:00 | NaN | 2812239.0 | 143549 sprintcare go httpstcov2tmhetl7q ask ad... | 2017-11-23 | 2017 |
| 2811772 | 2987949 | AldiUK | False | 2017-11-22 08:31:24+00:00 | NaN | 2987950.0 | 823870 sounds delicious sarah httpstco7uqpwyh1b6 | 2017-11-22 | 2017 |
| 2811773 | 2987950 | 823870 | True | 2017-11-21 22:01:04+00:00 | 2987951,2987949 | NaN | aldiuk warm sloe gin mince pies ice cream best... | 2017-11-21 | 2017 |
2804785 rows × 9 columns
Number of tweets by tweet_id, response_tweet_id and in_response_to_tweet_id per year
# Count the non-null tweet ids per year, most active year first
by_year = (
    clean_df
    .groupby('year')[['tweet_id', 'response_tweet_id', 'in_response_to_tweet_id']]
    .count()
    .sort_values(by='tweet_id', ascending=False)
)
by_year
| tweet_id | response_tweet_id | in_response_to_tweet_id | |
|---|---|---|---|
| year | |||
| 2017 | 2802572 | 1761944 | 2015802 |
| 2016 | 1481 | 1480 | 1164 |
| 2015 | 403 | 403 | 264 |
| 2014 | 184 | 184 | 125 |
| 2013 | 75 | 75 | 47 |
| 2012 | 52 | 52 | 31 |
| 2011 | 10 | 10 | 2 |
| 2010 | 6 | 6 | 2 |
| 2008 | 2 | 2 | 2 |
The year with the most tweets is 2017
From now on, we are going to use only the tweets from 2017, because that is the year with the most data.
# Define the start and end dates as datetime.date objects so they compare cleanly
# against the 'only_date' column (whose elements are datetime.date), avoiding the
# pandas FutureWarning about comparing Timestamp with datetime.date.
start_date = pd.to_datetime('2017-01-01').date()
end_date = pd.to_datetime('2017-12-31').date()
# Select rows between the start and end dates (bounds inclusive)
by_year = by_date.loc[by_date['only_date'].between(start_date, end_date)]
by_year
C:\Users\lalac\anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:73: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.
| only_date | tweet_id | |
|---|---|---|
| 0 | 2017-11-07 | 62621 |
| 1 | 2017-10-27 | 58984 |
| 2 | 2017-11-08 | 58034 |
| 3 | 2017-11-06 | 57844 |
| 4 | 2017-11-03 | 57099 |
| ... | ... | ... |
| 781 | 2017-02-25 | 2 |
| 897 | 2017-03-15 | 1 |
| 908 | 2017-01-29 | 1 |
| 935 | 2017-01-28 | 1 |
| 942 | 2017-01-23 | 1 |
337 rows × 2 columns
# Scatter of daily tweet counts across 2017
layout_opts = dict(
    title='Tweets for 2017',
    xaxis_title='Date',
    yaxis_title='Number of tweets',
)
fig = px.scatter(by_year, x="only_date", y="tweet_id")
fig.update_layout(**layout_opts)
fig.show()
# Define the start and end dates as datetime.date objects so they compare cleanly
# against the 'only_date' column, avoiding the deprecated Timestamp-vs-date
# comparison flagged by the FutureWarning below.
start_date_2017 = pd.to_datetime('2017-10-01').date()
end_date_2017 = pd.to_datetime('2017-12-31').date()
# Select rows between the start and end dates (bounds inclusive)
zoom_2017 = by_date.loc[by_date['only_date'].between(start_date_2017, end_date_2017)]
zoom_2017
C:\Users\lalac\anaconda3\lib\site-packages\pandas\core\ops\array_ops.py:73: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.
| only_date | tweet_id | |
|---|---|---|
| 0 | 2017-11-07 | 62621 |
| 1 | 2017-10-27 | 58984 |
| 2 | 2017-11-08 | 58034 |
| 3 | 2017-11-06 | 57844 |
| 4 | 2017-11-03 | 57099 |
| ... | ... | ... |
| 59 | 2017-11-12 | 20092 |
| 60 | 2017-11-18 | 11409 |
| 61 | 2017-10-03 | 7387 |
| 62 | 2017-10-02 | 3798 |
| 63 | 2017-10-01 | 1819 |
64 rows × 2 columns
# Zoomed-in scatter (Oct-Dec 2017) with a single overall trendline.
# Fix: title typo 'Dic' (Spanish abbreviation) corrected to 'Dec'.
fig = px.scatter(zoom_2017, x="only_date", y="tweet_id", trendline_scope="overall")
fig.update_layout(
    title='Tweets between Oct - Dec 2017',
    xaxis_title='Date',
    yaxis_title='Number of tweets',
)
fig.show()
Filter data for 2017 and clean column year
# Tweets sent by companies: inbound == False marks outbound (company) tweets.
# NOTE(review): the variable name 'customers' is misleading — per the inbound
# filter and the author_id values shown below (e.g. 'sprintcare'), it actually
# holds COMPANY tweets. Kept unchanged because later cells reference this name.
customers = clean_df[clean_df['inbound'] == False].loc[:, ['tweet_id','author_id','created_at']]
customers
| tweet_id | author_id | created_at | |
|---|---|---|---|
| 0 | 1 | sprintcare | 2017-10-31 22:10:47+00:00 |
| 3 | 4 | sprintcare | 2017-10-31 21:54:49+00:00 |
| 5 | 6 | sprintcare | 2017-10-31 21:46:24+00:00 |
| 7 | 11 | sprintcare | 2017-10-31 22:10:35+00:00 |
| 9 | 15 | sprintcare | 2017-10-31 20:03:31+00:00 |
| ... | ... | ... | ... |
| 2811764 | 2987943 | AirAsiaSupport | 2017-11-22 07:54:57+00:00 |
| 2811766 | 139628 | ArgosHelpers | 2017-11-22 08:03:26+00:00 |
| 2811767 | 2987945 | VirginTrains | 2017-11-22 08:27:34+00:00 |
| 2811769 | 2987947 | sprintcare | 2017-11-22 08:43:51+00:00 |
| 2811772 | 2987949 | AldiUK | 2017-11-22 08:31:24+00:00 |
1266942 rows × 3 columns
# Tweets received from customers: inbound == True marks tweets sent TO the company.
# NOTE(review): the variable name 'companies' is misleading — per the inbound
# filter and the numeric author_id values below, it actually holds CUSTOMER
# tweets. Kept unchanged because later cells reference this name.
companies = clean_df[clean_df['inbound'] == True].loc[:, ['author_id','created_at','in_response_to_tweet_id']]
companies
| author_id | created_at | in_response_to_tweet_id | |
|---|---|---|---|
| 1 | 115712 | 2017-10-31 22:11:45+00:00 | 1.0 |
| 2 | 115712 | 2017-10-31 22:08:27+00:00 | 4.0 |
| 4 | 115712 | 2017-10-31 21:49:35+00:00 | 6.0 |
| 6 | 115712 | 2017-10-31 21:45:10+00:00 | NaN |
| 8 | 115713 | 2017-10-31 22:04:47+00:00 | 15.0 |
| ... | ... | ... | ... |
| 2811765 | 823868 | 2017-11-22 07:43:36+00:00 | NaN |
| 2811768 | 524544 | 2017-11-22 08:25:48+00:00 | NaN |
| 2811770 | 823869 | 2017-11-22 08:35:16+00:00 | NaN |
| 2811771 | 121673 | 2017-11-23 04:13:07+00:00 | 2812239.0 |
| 2811773 | 823870 | 2017-11-21 22:01:04+00:00 | NaN |
1537843 rows × 3 columns
Merge the two tables, customers and companies, on tweet_id and in_response_to_tweet_id, in order to pair each tweet's creation time with the creation time of the tweet it responds to
# Pair each tweet with the tweet it responds to: join on
# in_response_to_tweet_id == tweet_id (inner join, the pandas default),
# then drop any rows still carrying missing values.
merge_df = pd.merge(
    companies,
    customers,
    left_on='in_response_to_tweet_id',
    right_on='tweet_id',
    how='inner',
).dropna()
merge_df
| author_id_x | created_at_x | in_response_to_tweet_id | tweet_id | author_id_y | created_at_y | |
|---|---|---|---|---|---|---|
| 0 | 115712 | 2017-10-31 22:11:45+00:00 | 1.0 | 1 | sprintcare | 2017-10-31 22:10:47+00:00 |
| 1 | 115712 | 2017-10-31 22:08:27+00:00 | 4.0 | 4 | sprintcare | 2017-10-31 21:54:49+00:00 |
| 2 | 115712 | 2017-10-31 21:49:35+00:00 | 6.0 | 6 | sprintcare | 2017-10-31 21:46:24+00:00 |
| 3 | 115712 | 2017-10-31 21:47:48+00:00 | 6.0 | 6 | sprintcare | 2017-10-31 21:46:24+00:00 |
| 4 | 115713 | 2017-10-31 22:04:47+00:00 | 15.0 | 15 | sprintcare | 2017-10-31 20:03:31+00:00 |
| ... | ... | ... | ... | ... | ... | ... |
| 539795 | 823852 | 2017-11-30 08:03:53+00:00 | 2987894.0 | 2987894 | nationalrailenq | 2017-11-30 08:02:32+00:00 |
| 539796 | 136417 | 2017-11-22 06:58:50+00:00 | 2987897.0 | 2987897 | VirginTrains | 2017-11-22 06:57:43+00:00 |
| 539797 | 823858 | 2017-11-30 07:31:54+00:00 | 2987915.0 | 2987915 | CoxHelp | 2017-11-30 07:27:11+00:00 |
| 539798 | 823859 | 2017-11-30 08:19:49+00:00 | 2987918.0 | 2987918 | ArgosHelpers | 2017-11-30 07:58:42+00:00 |
| 539799 | 783956 | 2017-11-22 07:15:45+00:00 | 2811285.0 | 2811285 | Safaricom_Care | 2017-11-22 07:13:34+00:00 |
539800 rows × 6 columns
Creating a new column with the response time
# Elapsed time between each tweet (created_at_x) and the tweet it
# responds to (created_at_y), stored as a Timedelta column.
merge_df['response_time'] = merge_df['created_at_x'].sub(merge_df['created_at_y'])
merge_df
| author_id_x | created_at_x | in_response_to_tweet_id | tweet_id | author_id_y | created_at_y | response_time | |
|---|---|---|---|---|---|---|---|
| 0 | 115712 | 2017-10-31 22:11:45+00:00 | 1.0 | 1 | sprintcare | 2017-10-31 22:10:47+00:00 | 0 days 00:00:58 |
| 1 | 115712 | 2017-10-31 22:08:27+00:00 | 4.0 | 4 | sprintcare | 2017-10-31 21:54:49+00:00 | 0 days 00:13:38 |
| 2 | 115712 | 2017-10-31 21:49:35+00:00 | 6.0 | 6 | sprintcare | 2017-10-31 21:46:24+00:00 | 0 days 00:03:11 |
| 3 | 115712 | 2017-10-31 21:47:48+00:00 | 6.0 | 6 | sprintcare | 2017-10-31 21:46:24+00:00 | 0 days 00:01:24 |
| 4 | 115713 | 2017-10-31 22:04:47+00:00 | 15.0 | 15 | sprintcare | 2017-10-31 20:03:31+00:00 | 0 days 02:01:16 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 539795 | 823852 | 2017-11-30 08:03:53+00:00 | 2987894.0 | 2987894 | nationalrailenq | 2017-11-30 08:02:32+00:00 | 0 days 00:01:21 |
| 539796 | 136417 | 2017-11-22 06:58:50+00:00 | 2987897.0 | 2987897 | VirginTrains | 2017-11-22 06:57:43+00:00 | 0 days 00:01:07 |
| 539797 | 823858 | 2017-11-30 07:31:54+00:00 | 2987915.0 | 2987915 | CoxHelp | 2017-11-30 07:27:11+00:00 | 0 days 00:04:43 |
| 539798 | 823859 | 2017-11-30 08:19:49+00:00 | 2987918.0 | 2987918 | ArgosHelpers | 2017-11-30 07:58:42+00:00 | 0 days 00:21:07 |
| 539799 | 783956 | 2017-11-22 07:15:45+00:00 | 2811285.0 | 2811285 | Safaricom_Care | 2017-11-22 07:13:34+00:00 | 0 days 00:02:11 |
539800 rows × 7 columns
# Shortest observed response time (an instantaneous reply)
merge_df['response_time'].min()
Timedelta('0 days 00:00:00')
# Longest observed response time (2759 days ≈ 7.5 years — an extreme outlier)
merge_df['response_time'].max()
Timedelta('2759 days 00:02:30')
# Mean response time — pulled upward by the extreme outliers above
merge_df['response_time'].mean()
Timedelta('1 days 07:00:42.071689514')
# Median response time (~12 minutes) — a more robust "typical" value
merge_df['response_time'].median()
Timedelta('0 days 00:12:03')
# Row(s) with the longest response time. Derive the threshold from the data
# via .max() rather than hard-coding the string '2759 days 00:02:30', so the
# cell stays correct if the data (or upstream cleaning) changes.
max_ = merge_df[merge_df['response_time'] == merge_df['response_time'].max()]
max_
| author_id_x | created_at_x | in_response_to_tweet_id | tweet_id | author_id_y | created_at_y | response_time | |
|---|---|---|---|---|---|---|---|
| 416270 | 665444 | 2017-10-19 16:55:57+00:00 | 2291018.0 | 2291018 | CarlsJr | 2010-03-31 16:53:27+00:00 | 2759 days 00:02:30 |
# Row(s) with the shortest response time. Derive the threshold from the data
# via .min() rather than hard-coding the string '0 days 00:00:00', so the
# cell stays correct if the data (or upstream cleaning) changes.
min_ = merge_df[merge_df['response_time'] == merge_df['response_time'].min()]
min_
| author_id_x | created_at_x | in_response_to_tweet_id | tweet_id | author_id_y | created_at_y | response_time | |
|---|---|---|---|---|---|---|---|
| 341471 | 533103 | 2017-10-17 22:42:51+00:00 | 1773808.0 | 1773808 | AmazonHelp | 2017-10-17 22:42:51+00:00 | 0 days |
# Row(s) whose response time equals the median. Derive the threshold from the
# data via .median() rather than hard-coding the string '0 days 00:12:03', so
# the cell stays correct if the data (or upstream cleaning) changes.
median_ = merge_df[merge_df['response_time'] == merge_df['response_time'].median()]
median_
| author_id_x | created_at_x | in_response_to_tweet_id | tweet_id | author_id_y | created_at_y | response_time | |
|---|---|---|---|---|---|---|---|
| 2739 | 119694 | 2017-11-22 14:16:37+00:00 | 16744.0 | 16744 | Ask_Spectrum | 2017-11-22 14:04:34+00:00 | 0 days 00:12:03 |
| 14592 | 133237 | 2017-11-30 12:50:30+00:00 | 77304.0 | 77304 | Ask_WellsFargo | 2017-11-30 12:38:27+00:00 | 0 days 00:12:03 |
| 15647 | 134321 | 2017-11-30 14:07:36+00:00 | 83367.0 | 83367 | AmazonHelp | 2017-11-30 13:55:33+00:00 | 0 days 00:12:03 |
| 16012 | 134791 | 2017-11-30 16:34:03+00:00 | 86065.0 | 86065 | AmazonHelp | 2017-11-30 16:22:00+00:00 | 0 days 00:12:03 |
| 18743 | 138306 | 2017-11-30 17:41:31+00:00 | 101832.0 | 101832 | AmazonHelp | 2017-11-30 17:29:28+00:00 | 0 days 00:12:03 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 511755 | 787867 | 2017-11-27 14:00:55+00:00 | 2829133.0 | 2829133 | VirginTrains | 2017-11-27 13:48:52+00:00 | 0 days 00:12:03 |
| 517121 | 794133 | 2017-11-13 09:19:30+00:00 | 2857322.0 | 2857322 | airtel_care | 2017-11-13 09:07:27+00:00 | 0 days 00:12:03 |
| 520319 | 798334 | 2017-11-28 05:41:16+00:00 | 2875723.0 | 2875723 | hulu_support | 2017-11-28 05:29:13+00:00 | 0 days 00:12:03 |
| 531289 | 794314 | 2017-11-29 10:06:03+00:00 | 2934377.0 | 2934377 | AmazonHelp | 2017-11-29 09:54:00+00:00 | 0 days 00:12:03 |
| 536635 | 814812 | 2017-11-29 22:00:58+00:00 | 2964959.0 | 2964959 | NikeSupport | 2017-11-29 21:48:55+00:00 | 0 days 00:12:03 |
130 rows × 7 columns
# Scatter of response times per company (author_id_y on the x axis)
layout_opts = dict(
    title='Response time for all companies',
    xaxis_title='Companies',
    yaxis_title='Response time',
)
fig = px.scatter(merge_df, x="author_id_y", y="response_time")
fig.update_layout(**layout_opts)
fig.show()